In [48]:
#Download the database
import urllib.request
import os 
import tarfile

url = "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
filepath="data/aclImdb_v1.tar.gz"
if not os.path.isfile(filepath):
    result = urllib.request.urlretrieve(url,filepath)
    print('download:', result)
    
if not os.path.exists("data/aclImb"):
    tfile = tarfile.open("data/aclImdb_v1.tar.gz",'r:gz')
    result = tfile.extractall('data/') #解壓縮
        


In [49]:
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
import re

#remove HTML label
def rm_tags(text):
    re_tag = re.compile(r'<[^>]+>')
    return re_tag.sub('',text)


#loading IMDb files
import os
def read_files(filetype):
    path = "data/aclImdb/"
    file_list=[]
    positive_path = path+filetype+"/pos/"
    for f in os.listdir(positive_path):
        file_list+=[positive_path+f]
        
    negative_path = path+filetype+"/neg/"
    for f in os.listdir(negative_path):
        file_list+=[negative_path+f]   
    
    print('read',filetype, 'files', len(file_list))
    all_labels = ([1]*12500+[0]*12500)
    all_texts  = []
    for fi in file_list:
        with open(fi, encoding='utf8') as file_input:
            all_texts += [rm_tags(" ".join(file_input.readlines()))] #conect file after remove HTNL 
            
    return all_labels,all_texts


In [50]:
y_train, train_text = read_files("train")
y_test , test_text  = read_files("test")

read train files 25000
read test files 25000


In [51]:
#check
print('y_train[0] is pos or neg = ',y_train[0])
print('train_text[0]:')
train_text[0]

print('y_train[12501] is pos or neg = ',y_train[12501])
print('train_text[12501]:')
train_text[12501]



y_train[0] is pos or neg =  1
train_text[0]:
y_train[12501] is pos or neg =  0
train_text[12501]:


"Airport '77 starts as a brand new luxury 747 plane is loaded up with valuable paintings & such belonging to rich businessman Philip Stevens (James Stewart) who is flying them & a bunch of VIP's to his estate in preparation of it being opened to the public as a museum, also on board is Stevens daughter Julie (Kathleen Quinlan) & her son. The luxury jetliner takes off as planned but mid-air the plane is hi-jacked by the co-pilot Chambers (Robert Foxworth) & his two accomplice's Banker (Monte Markham) & Wilson (Michael Pataki) who knock the passengers & crew out with sleeping gas, they plan to steal the valuable cargo & land on a disused plane strip on an isolated island but while making his descent Chambers almost hits an oil rig in the Ocean & loses control of the plane sending it crashing into the sea where it sinks to the bottom right bang in the middle of the Bermuda Triangle. With air in short supply, water leaking in & having flown over 200 miles off course the problems mount for 

In [52]:
#build token
token = Tokenizer(num_words = 2000)
token.fit_on_texts(train_text)

#print(token.document_count)
#print(token.word_index)

#token
x_train_seq = token.texts_to_sequences(train_text)
x_test_seq  = token.texts_to_sequences(test_text)

#print(train_text[0])
#print(x_train_seq[0])

#same length
x_train = sequence.pad_sequences(x_train_seq, maxlen=100)
x_test  = sequence.pad_sequences(x_test_seq , maxlen=100)

print('before pad_sequences length = ', len(x_train_seq[0]))
print(x_train_seq[0])


before pad_sequences length =  106
[308, 6, 3, 1068, 208, 8, 29, 1, 168, 54, 13, 45, 81, 40, 391, 109, 137, 13, 57, 149, 7, 1, 481, 68, 5, 260, 11, 6, 72, 5, 631, 70, 6, 1, 5, 1, 1530, 33, 66, 63, 204, 139, 64, 1229, 1, 4, 1, 222, 899, 28, 68, 4, 1, 9, 693, 2, 64, 1530, 50, 9, 215, 1, 386, 7, 59, 3, 1470, 798, 5, 176, 1, 391, 9, 1235, 29, 308, 3, 352, 343, 142, 129, 5, 27, 4, 125, 1470, 5, 308, 9, 532, 11, 107, 1466, 4, 57, 554, 100, 11, 308, 6, 226, 47, 3, 11, 8, 214]


In [53]:
#model
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers.embeddings import Embedding

model = Sequential()
model.add(Embedding(output_dim=32,input_dim=2000,input_length=100))
model.add(Dropout(0.2))

model.add(Flatten())
model.add(Dense(units=256,activation='relu'))
model.add(Dropout(0.3))

model.add(Dense(units=1,activation='sigmoid'))
#model.summary()

model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
train_history = model.fit(x_train,y_train,batch_size=100,epochs=10,verbose=2,validation_split=0.2)

Train on 20000 samples, validate on 5000 samples
Epoch 1/10
4s - loss: 0.4773 - acc: 0.7558 - val_loss: 0.4363 - val_acc: 0.8006
Epoch 2/10
4s - loss: 0.2676 - acc: 0.8913 - val_loss: 0.4232 - val_acc: 0.8148
Epoch 3/10
4s - loss: 0.1662 - acc: 0.9379 - val_loss: 0.7306 - val_acc: 0.7294
Epoch 4/10
4s - loss: 0.0848 - acc: 0.9701 - val_loss: 0.7728 - val_acc: 0.7614
Epoch 5/10
4s - loss: 0.0532 - acc: 0.9813 - val_loss: 0.9236 - val_acc: 0.7596
Epoch 6/10
4s - loss: 0.0380 - acc: 0.9857 - val_loss: 1.0134 - val_acc: 0.7662
Epoch 7/10
4s - loss: 0.0301 - acc: 0.9890 - val_loss: 1.1382 - val_acc: 0.7700
Epoch 8/10
4s - loss: 0.0310 - acc: 0.9882 - val_loss: 1.1488 - val_acc: 0.7700
Epoch 9/10
4s - loss: 0.0302 - acc: 0.9893 - val_loss: 1.2833 - val_acc: 0.7440
Epoch 10/10
4s - loss: 0.0236 - acc: 0.9923 - val_loss: 1.2223 - val_acc: 0.7630


In [54]:
#scores & predict
scores = model.evaluate(x_test,y_test,verbose=1)
print('the scores is : ',scores[1])

predict = model.predict_classes(x_test)
#predict[:10]

predict_classes = predict.reshape(-1)
#predict_classes[:10]




In [55]:
#數值轉換文字- function
SentimentDict={1:'正面的',0:'負面的'}
def display_test_sentiment(i):
    print(test_text[i])
    print('label真實值:', SentimentDict[y_test[i]],'預測結果:',SentimentDict[predict_classes[i]])

In [56]:
#顯示測試的評論
display_test_sentiment(2)

As a recreational golfer with some knowledge of the sport's history, I was pleased with Disney's sensitivity to the issues of class in golf in the early twentieth century. The movie depicted well the psychological battles that Harry Vardon fought within himself, from his childhood trauma of being evicted to his own inability to break that glass ceiling that prevents him from being accepted as an equal in English golf society. Likewise, the young Ouimet goes through his own class struggles, being a mere caddie in the eyes of the upper crust Americans who scoff at his attempts to rise above his standing. What I loved best, however, is how this theme of class is manifested in the characters of Ouimet's parents. His father is a working-class drone who sees the value of hard work but is intimidated by the upper class; his mother, however, recognizes her son's talent and desire and encourages him to pursue his dream of competing against those who think he is inferior.Finally, the golf scenes

In [57]:
#預測影評

#build function
def predict_review(input_text):
    input_seq= token.texts_to_sequences([input_text]) #text to list
    pad_input_seq = sequence.pad_sequences(input_seq, maxlen=100) #limit text number
    predict_result = model.predict_classes([pad_input_seq])#strat predict
    print(' 此影評為',SentimentDict[predict_result[0][0]])#
    
    #print('input_text')
    #print(input_seq[0])
    #print('input_text length = ', len(input_text ),'\n' )
    
#predict new text from web
input_text = 'I loved this Movie I love The Cast I love Emma Watson I Love The Movie A Lot Beauty And Beast Movie Will Be In My Favorite List Always I Love the Movie Very Much In The Movie Emma Watson Was Looking Very beautiful Her character was So good In the film i am always a greatest fan of Emma Watson I love All the movie in which she act She always acting very beautifully.'
predict_review(input_text)


 此影評為 正面的


In [60]:


#special RNN MODEL for times
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import SimpleRNN
model = Sequential()
model.add(Embedding(output_dim=32,input_dim=2000,input_length=100))
model.add(Dropout(0.35))
model.add(SimpleRNN(units=16))
model.add(Dense(units=256,activation='relu'))
model.add(Dropout(0.35))
model.add(Dense(units=1,activation='sigmoid'))
model.summary()
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
train_history = model.fit(x_train,y_train,batch_size=100,epochs=10,verbose=2,validation_split=0.2)
scores = model.evaluate(x_test,y_test,verbose=1)
scores[1]

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_14 (Embedding)     (None, 100, 32)           64000     
_________________________________________________________________
dropout_27 (Dropout)         (None, 100, 32)           0         
_________________________________________________________________
simple_rnn_7 (SimpleRNN)     (None, 16)                784       
_________________________________________________________________
dense_27 (Dense)             (None, 256)               4352      
_________________________________________________________________
dropout_28 (Dropout)         (None, 256)               0         
_________________________________________________________________
dense_28 (Dense)             (None, 1)                 257       
Total params: 69,393
Trainable params: 69,393
Non-trainable params: 0
_________________________________________________________________
Train 

0.81976000000000004

In [61]:

#special LSTM MODEL for times
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM
model = Sequential()
model.add(Embedding(output_dim=32,input_dim=2000,input_length=100))
model.add(Dropout(0.2))
model.add(LSTM(32))
model.add(Dense(units=256,activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(units=1,activation='sigmoid'))
model.summary()
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
train_history = model.fit(x_train,y_train,batch_size=100,epochs=10,verbose=2,validation_split=0.2)
scores = model.evaluate(x_test,y_test,verbose=1)
scores[1]

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_15 (Embedding)     (None, 100, 32)           64000     
_________________________________________________________________
dropout_29 (Dropout)         (None, 100, 32)           0         
_________________________________________________________________
lstm_4 (LSTM)                (None, 32)                8320      
_________________________________________________________________
dense_29 (Dense)             (None, 256)               8448      
_________________________________________________________________
dropout_30 (Dropout)         (None, 256)               0         
_________________________________________________________________
dense_30 (Dense)             (None, 1)                 257       
Total params: 81,025
Trainable params: 81,025
Non-trainable params: 0
_________________________________________________________________
Train 

0.82264000000000004