In [6]:
from keras.datasets import imdb
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer

Using TensorFlow backend.


In [7]:
import re
def rm_tags(text):
    re_tag = re.compile(r'<[^>]+>')
    return re_tag.sub('', text)

In [8]:
import os
def read_files(filetype):
    path = "data/aclImdb/"
    file_list=[]
    
    positive_path=path + filetype + "/pos/"
    for f in os.listdir(positive_path):
        file_list+=[positive_path+f]
        
    negative_path=path + filetype + "/neg/"
    for f in os.listdir(negative_path):
        file_list+=[negative_path+f]
    
    print('read',filetype, 'files:',len(file_list))
    
    all_labels = ([1] * 12500 + [0] * 12500)
    
    all_texts = []
    for fi in file_list:
        with open(fi, encoding = 'utf8') as file_input:
            all_texts += [rm_tags(" ".join(file_input.readlines()))]
            
    return all_labels,all_texts

In [9]:
y_train,train_text=read_files("train")

read train files: 25000


In [10]:
y_test,test_text=read_files('test')

read test files: 25000


In [47]:
token = Tokenizer(num_words=3800)
token.fit_on_texts(train_text)

In [48]:
x_train_seq = token.texts_to_sequences(train_text)
x_test_seq = token.texts_to_sequences(test_text)

In [49]:
x_train = sequence.pad_sequences(x_train_seq, maxlen=380)
x_test  = sequence.pad_sequences(x_test_seq,  maxlen=380)

In [50]:
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation,Flatten
from keras.layers.embeddings import Embedding

In [51]:
model = Sequential()

In [52]:
model.add(Embedding(output_dim=32,
                    input_dim=3800,
                    input_length=380))
model.add(Dropout(0.2))

In [53]:
model.add(Flatten())

In [54]:
model.add(Dense(units=256,
               activation='relu'))
model.add(Dropout(0.35))

In [55]:
model.add(Dense(units=1,
               activation='sigmoid'))

In [56]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 380, 32)           121600    
_________________________________________________________________
dropout_3 (Dropout)          (None, 380, 32)           0         
_________________________________________________________________
flatten_2 (Flatten)          (None, 12160)             0         
_________________________________________________________________
dense_3 (Dense)              (None, 256)               3113216   
_________________________________________________________________
dropout_4 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 257       
Total params: 3,235,073
Trainable params: 3,235,073
Non-trainable params: 0
_________________________________________________________________


In [57]:
model.compile(loss='binary_crossentropy',
             optimizer='adam',
             metrics=['accuracy'])

In [58]:
train_history = model.fit(x_train, y_train, batch_size=100,
                         epochs=10,verbose=2,
                         validation_split=0.2)

Train on 20000 samples, validate on 5000 samples
Epoch 1/10
 - 32s - loss: 0.4732 - acc: 0.7591 - val_loss: 0.3492 - val_acc: 0.8508
Epoch 2/10
 - 31s - loss: 0.2008 - acc: 0.9215 - val_loss: 0.4287 - val_acc: 0.8288
Epoch 3/10
 - 33s - loss: 0.0769 - acc: 0.9769 - val_loss: 0.7124 - val_acc: 0.7758
Epoch 4/10
 - 39s - loss: 0.0280 - acc: 0.9923 - val_loss: 0.5238 - val_acc: 0.8612
Epoch 5/10
 - 36s - loss: 0.0158 - acc: 0.9959 - val_loss: 0.8162 - val_acc: 0.8090
Epoch 6/10
 - 37s - loss: 0.0111 - acc: 0.9968 - val_loss: 1.0496 - val_acc: 0.7806
Epoch 7/10
 - 36s - loss: 0.0096 - acc: 0.9974 - val_loss: 0.9755 - val_acc: 0.8090
Epoch 8/10
 - 37s - loss: 0.0117 - acc: 0.9962 - val_loss: 0.9495 - val_acc: 0.8078
Epoch 9/10
 - 40s - loss: 0.0157 - acc: 0.9943 - val_loss: 1.1557 - val_acc: 0.7870
Epoch 10/10
 - 35s - loss: 0.0165 - acc: 0.9946 - val_loss: 1.1992 - val_acc: 0.7900


In [59]:
scores = model.evaluate(x_test, y_test, verbose=1)
scores[1]



0.84467999999999999

In [26]:
predict=model.predict_classes(x_test)

In [27]:
predict[:10]

array([[1],
       [1],
       [0],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1]])

In [28]:
predict_classes=predict.reshape(-1)
predict_classes[:10]

array([1, 1, 0, 1, 1, 1, 1, 1, 1, 1])

In [29]:
SentimentDict={1:'正面的',0:'負面的'}
def display_test_Sentiment(i):
    print(test_text[i])
    print('label真實值:',SentimentDict[y_test[i]],
         '預測結果:',SentimentDict[predict_classes[i]])

In [31]:
display_test_Sentiment(1)

Actor turned director Bill Paxton follows up his promising debut, the Gothic-horror "Frailty", with this family friendly sports drama about the 1913 U.S. Open where a young American caddy rises from his humble background to play against his Bristish idol in what was dubbed as "The Greatest Game Ever Played." I'm no fan of golf, and these scrappy underdog sports flicks are a dime a dozen (most recently done to grand effect with "Miracle" and "Cinderella Man"), but some how this film was enthralling all the same.The film starts with some creative opening credits (imagine a Disneyfied version of the animated opening credits of HBO's "Carnivale" and "Rome"), but lumbers along slowly for its first by-the-numbers hour. Once the action moves to the U.S. Open things pick up very well. Paxton does a nice job and shows a knack for effective directorial flourishes (I loved the rain-soaked montage of the action on day two of the open) that propel the plot further or add some unexpected psychologic

In [32]:
display_test_Sentiment(12502)

First of all I hate those moronic rappers, who could'nt act if they had a gun pressed against their foreheads. All they do is curse and shoot each other and acting like cliché'e version of gangsters.The movie doesn't take more than five minutes to explain what is going on before we're already at the warehouse There is not a single sympathetic character in this movie, except for the homeless guy, who is also the only one with half a brain.Bill Paxton and William Sadler are both hill billies and Sadlers character is just as much a villain as the gangsters. I did'nt like him right from the start.The movie is filled with pointless violence and Walter Hills specialty: people falling through windows with glass flying everywhere. There is pretty much no plot and it is a big problem when you root for no-one. Everybody dies, except from Paxton and the homeless guy and everybody get what they deserve.The only two black people that can act is the homeless guy and the junkie but they're actors by 

In [60]:
input_text='''
I am a huge fan of the original and I was thrilled when the cast was announced. I'm a big fan of Emma Watson and most of the other supporting actors so I went in with high hopes for this. It was awful! The CGI and auto-tuning were distracting and poorly done. On the subject of auto-tune- why did they insist Emma Watson do her own vocals when she clearly wasn't up to the task? Several other numbers, notably "Gaston" and "Be our Guest" fell flat. None of the charm or warmth of the original.
The performances were another issue for me which blows my mind considering the talent propping this horrid remake up. I can't fathom who approved the accents of Ewen Mcgregor and Emma Thompson. They were BAD. Emma Watson's performance was not what I expected from her. It was like she was trying but missing the mark time and again. Her Belle is condescending at times, bland in others, and overall forgettable. **spoiler** When Gaston and the beast have their fight,in this version instead of stabbing beast in the back, Gaston shoots him unexpectedly like twice. Emma Watson's "reaction" to this is a prime example of my above complaints. She doesn't seem shocked, sad, NOTHING. She waits until he's been shot a few times and has been down awhile before changing emotion at all and even then her "sorrow" at his death is horribly unbelievable. I could not believe this was Emma Watson preforming in this way. The beast was eh, Lafou wasn't funny (the theater was at no point filled with laughter. My 10 year old laughed twice the whole time), and the servants weren't charming or at all like their cartoon versions. I also hate that the funny back and forth between Lumiere and Cogsworth wasn't there. The only one I enjoyed was Luke Evans as Gaston. He was far from perfect but I think he did best out of everyone. 
As I scroll through the IMDb reviews with the occasional 8 or 9, and pages of 2's and 4's, I can't understand how the rating is a 7.8. I give it a 2 for effort and can say with 100% certainty that I won't ever sit through it again. Another pointless remake. Disappointing
'''

In [61]:
input_seq = token.texts_to_sequences([input_text])

In [62]:
print(input_seq[0])

[9, 240, 3, 662, 333, 4, 1, 200, 2, 9, 12, 50, 1, 173, 12, 142, 3, 190, 333, 4, 2637, 2, 87, 4, 1, 81, 692, 152, 34, 9, 431, 7, 15, 308, 1905, 14, 10, 8, 12, 369, 1, 1679, 2, 67, 2, 857, 220, 19, 1, 871, 4, 3191, 134, 118, 32, 2637, 78, 37, 201, 50, 55, 691, 281, 52, 5, 1, 2781, 446, 81, 1390, 3715, 2, 25, 259, 3463, 1579, 1030, 595, 4, 1, 1375, 38, 4, 1, 200, 1, 350, 67, 155, 1828, 14, 68, 59, 3660, 57, 326, 1065, 1, 671, 10, 1029, 52, 9, 187, 33, 1, 2459, 4, 2, 2637, 32, 67, 75, 2637, 235, 12, 20, 47, 9, 868, 35, 37, 8, 12, 36, 55, 12, 265, 17, 1008, 1, 947, 54, 2, 170, 37, 6, 29, 207, 1914, 7, 404, 2, 442, 2442, 1379, 50, 2, 1, 2772, 24, 64, 544, 7, 10, 306, 300, 4, 2772, 7, 1, 141, 3239, 86, 36, 1447, 2637, 2087, 5, 10, 6, 3, 2478, 456, 4, 57, 748, 55, 148, 302, 2412, 614, 159, 55, 362, 236, 73, 320, 3, 167, 207, 2, 43, 73, 176, 154, 2535, 1421, 29, 28, 2, 56, 91, 37, 29, 23, 337, 6, 2356, 1296, 9, 96, 20, 260, 10, 12, 2637, 7, 10, 92, 1, 2772, 12, 281, 158, 1, 747, 12, 29, 53, 209

In [63]:
len(input_seq[0])

338

In [64]:
pad_input_seq = sequence.pad_sequences(input_seq, maxlen=380)

In [65]:
len(pad_input_seq[0])

380

In [66]:
predict_result=model.predict_classes(pad_input_seq)

In [67]:
predict_result

array([[1]])

In [68]:
predict_result[0][0]

1

In [69]:
SentimentDict[predict_result[0][0]]

'正面的'

In [70]:
def predict_review(input_text):
    input_seq = token.texts_to_sequences([input_text])
    pad_input_seq = sequence.pad_sequences(input_seq, maxlen=380)
    predict_result=model.predict_classes(pad_input_seq)
    print(SentimentDict[predict_result[0][0]])

In [71]:
predict_review('''
THIS MAY CONTAIN SPOILERS 

It is really simple. 

If they wanted to make a good movie, a really good movie they would take care of two things. 1. Belle. They would cast an actor who can actually ...act.

(After watching the movie it is pretty obvious that the only reason Watson landed the role is her pretty big fan base of teenagers from her Harry Potter films)

What we needed for Belle's role? 

Fresh, energetic, with a smile that lights her face actress. 

What we got? 

The twin sister of Bella Swan that landed herself in medieval France. Seriously I was waiting for Edward Cullen to do a cameo from time to time. 

Haven't watched miss Watson in Harry Potter films but this was just a disaster. 

2. The Beast. 

It was so.... distressing to watch a soulless face throughout the movie talking. Even at the peak of the Beast's story when he lets Belle go, and we suppose to see the agony, the pain, all I get is.... nothing. Soulless. 

Flat. 

Disappointing.

I am confused. Should the actor still work harder and deliver his role as a beast better or it is due that horrible CGI head that we can't appreciate his performance? 

Either way, again, they could do a lot better. 

To sum this up, soulless Beast x miscast Belle leads to this awful remake to a classic animated film that we all grew up to love. 

Save your money and go watch something else.''')

負面的
