# KHOA LUAN VIETSTOCK
Huynh Duc Huy

In [1]:
from __future__ import print_function
import time
import numpy as np
np.random.seed(1337)  # for reproducibility

from keras.preprocessing import sequence
from keras.models import Model, Sequential
from keras.layers import Dense, Dropout, Embedding, LSTM, Input, merge, BatchNormalization,GRU
from keras.datasets import imdb

import os
from keras.preprocessing.text import Tokenizer

Using Theano backend.


In [2]:
# Vocabulary cap: handed to the Tokenizer as nb_words and used as the
# input dimension of the Embedding layers.
max_features = 40000
# Every tokenized text is padded or truncated to exactly this many word ids
# before it is fed to the networks.
max_len = 500

In [3]:
# Load the training corpus from the local vietstock folder: one .txt file per
# document, split into pos/ and neg/ sub-folders (the same directory layout as
# the aclImdb dataset, http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz).

X_train = []
y_train = []

train_dir = 'C:/1_Research/Create_data/aclImdb/newest/vietstock_27_12/train/'
for label, subdir in ((1, 'pos/'), (0, 'neg/')):
    path = train_dir + subdir
    # os.listdir returns entries in arbitrary order; sorting keeps the sample
    # order stable from run to run.
    for f in sorted(os.listdir(path)):
        if not f.endswith('.txt'):
            continue
        # The context manager closes each handle as soon as the file is read.
        # NOTE(review): files are still decoded with the platform default
        # encoding, as before - confirm this matches how the corpus was saved.
        with open(path + f) as text_file:
            X_train.append(text_file.read())
        # Append the label together with its document so texts and labels stay
        # aligned even when files are added to or removed from the folders
        # (previously the label counts were hard-coded as 1325 / 390).
        y_train.append(label)


In [4]:
# Load the held-out test corpus: one .txt file per document, split into pos/
# and neg/ sub-folders.

X_test = []
y_test = []

test_dir = 'C:/1_Research/Create_data/aclImdb/newest/vietstock_27_12/test/'
for label, subdir in ((1, 'pos/'), (0, 'neg/')):
    path = test_dir + subdir
    # os.listdir returns entries in arbitrary order; sorting keeps the sample
    # order stable from run to run.
    for f in sorted(os.listdir(path)):
        if not f.endswith('.txt'):
            continue
        # The context manager closes each handle as soon as the file is read.
        # NOTE(review): files are still decoded with the platform default
        # encoding, as before - confirm this matches how the corpus was saved.
        with open(path + f) as text_file:
            X_test.append(text_file.read())
        # Append the label together with its document so texts and labels stay
        # aligned even when files are added to or removed from the folders
        # (previously the label counts were hard-coded as 614 / 142).
        y_test.append(label)


In [5]:
# Tokenize words into integer ids, where each id is the key of one word.
# The vocabulary is capped through nb_words=max_features and is fitted on the
# training texts only; the test texts are not passed to fit_on_texts.
imdbTokenizer = Tokenizer(nb_words=max_features)

imdbTokenizer.fit_on_texts(X_train)

In [6]:
#print top 20 words 
#note: index 0 is never assigned to a word; Keras reserves it, and pad_sequences uses 0 as the padding value
#for word, value in imdbTokenizer.word_index.items():
   # if value < 20:
       # print(value, word)

In [7]:
# Reverse lookup table: integer id -> word, built by inverting the
# tokenizer's word -> id mapping.
intToWord = {index: token for token, index in imdbTokenizer.word_index.items()}

# Register a placeholder symbol under id 0 (the null slot).
intToWord[0] = "!!!NA!!!"
    
#print(intToWord[1])
#print(intToWord[2])
#print(intToWord[32])

In [8]:
#convert word strings to integer sequence lists
#print(X_train[0])
#print(imdbTokenizer.texts_to_sequences(X_train[:1]))
#for value in imdbTokenizer.texts_to_sequences(X_train[:1])[0]:
    #print(intToWord[value])
    
# Replace the raw text of each document with its list of integer word ids,
# using the tokenizer fitted on the training texts.
# NOTE: X_train / X_test are rebound in place, so re-running this cell would
# feed the already-converted sequences back into the tokenizer; re-run the
# data loading cells first.
X_train = imdbTokenizer.texts_to_sequences(X_train)
X_test = imdbTokenizer.texts_to_sequences(X_test)

In [9]:
# Censor the data by having a max review length (in number of words)

#use this function to load data from keras pickle instead of munging as shown above
#(X_train, y_train), (X_test, y_test) = imdb.load_data(nb_words=max_features,
#                                                      test_split=0.2)

# Report how many documents each split holds.
train_count, test_count = len(X_train), len(X_test)
print(train_count, 'train sequences')
print(test_count, 'test sequences')

# Bring every sequence to exactly max_len ids so each split becomes a
# rectangular (samples x time) array.
print("Pad sequences (samples x time)")
X_train, X_test = (
    sequence.pad_sequences(texts_as_ids, maxlen=max_len)
    for texts_as_ids in (X_train, X_test)
)
print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)

# Labels go from Python lists to NumPy arrays for training.
y_train, y_test = np.array(y_train), np.array(y_test)

1715 train sequences
756 test sequences
Pad sequences (samples x time)
X_train shape: (1715L, 500L)
X_test shape: (756L, 500L)


In [10]:
#example of a sentence sequence, note that lower integers are words that occur more commonly
#print("x:", X_train[0]) #one padded observation: a vector of max_len (500) word ids
#print("y:", y_train[0]) #positive or negative review encoding

In [11]:
# double check that word sequences behave/final dimensions are as expected
#print("y distribution:", np.unique(y_train, return_counts=True))
#print("max x word:", np.max(X_train), "; min x word", np.min(X_train))
#print("y distribution test:", np.unique(y_test, return_counts=True))
#print("max x word test:", np.max(X_test), "; min x word", np.min(X_test))

In [12]:
#print("most and least popular words: ")
#print(np.unique(X_train, return_counts=True))
# as expected, zero is the most frequent value: it is the padding id added by pad_sequences, not a real word

In [13]:
# Model hyperparameters shared by every network in this notebook.
epochs = 100            # passes over the training set per fit() call
embedding_neurons = 64  # size of each word-embedding vector
lstm_neurons = 128      # units per recurrent layer (the models below use GRU layers)
batch_size = 32         # samples per gradient update

In [14]:
# Forward-only recurrent network (the recurrent layer is a GRU, not an LSTM)

# Placeholder tensor for the padded integer sequences. It is named
# `sequence_input` so that it does not rebind `sequence`, the
# keras.preprocessing module imported at the top of the notebook, which the
# padding cell needs whenever it is re-run.
sequence_input = Input(shape=(max_len,), dtype='int32')
# Map every word id to a trainable vector of size embedding_neurons.
embedded = Embedding(max_features, embedding_neurons, input_length=max_len)(sequence_input)
# Batch-normalize the embedded sequence before the recurrent layer.
bnorm = BatchNormalization()(embedded)

# Single forward GRU with lstm_neurons units; dropout_W / dropout_U apply
# dropout to the input and recurrent connections respectively.
forwards = GRU(lstm_neurons, dropout_W=0.2, dropout_U=0.2)(bnorm)

# Dropout on the GRU output, then one sigmoid unit for the binary label.
after_dp = Dropout(0.3)(forwards)
output = Dense(1, activation='sigmoid')(after_dp)

model_fdir_atom = Model(input=sequence_input, output=output)
# Review the model structure. summary() prints the table itself and returns
# None, so wrapping it in print() would emit a stray "None" line.
model_fdir_atom.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_1 (InputLayer)             (None, 500)           0                                            
____________________________________________________________________________________________________
embedding_1 (Embedding)          (None, 500, 64)       2560000     input_1[0][0]                    
____________________________________________________________________________________________________
batchnormalization_1 (BatchNormal(None, 500, 64)       128         embedding_1[0][0]                
____________________________________________________________________________________________________
gru_1 (GRU)                      (None, 128)           74112       batchnormalization_1[0][0]       
___________________________________________________________________________________________

In [15]:
# Train the forward-only GRU network and report the mean wall-clock time per epoch.

# Adam optimizer with binary cross-entropy loss; accuracy is tracked as an
# additional metric. Other optimizers / optimizer configs can be tried here.
model_fdir_atom.compile(optimizer='adam',
                        loss='binary_crossentropy',
                        metrics=['accuracy'])

print('Train...')
start_time = time.time()

# NOTE(review): the test split is used as validation data, and the recorded
# log shows val_loss rising from the second epoch on while the training loss
# keeps falling.
history_fdir_atom = model_fdir_atom.fit(
    X_train, y_train,
    batch_size=batch_size,
    nb_epoch=epochs,
    validation_data=[X_test, y_test],
    verbose=2)

end_time = time.time()
elapsed_seconds = end_time - start_time
average_time_per_epoch = elapsed_seconds / epochs
print("avg sec per epoch:", average_time_per_epoch)

Train...
Train on 1715 samples, validate on 756 samples
Epoch 1/100
76s - loss: 0.6046 - acc: 0.7201 - val_loss: 0.5150 - val_acc: 0.8108
Epoch 2/100
87s - loss: 0.4731 - acc: 0.7860 - val_loss: 0.5403 - val_acc: 0.7817
Epoch 3/100
85s - loss: 0.4031 - acc: 0.8227 - val_loss: 0.6227 - val_acc: 0.7513
Epoch 4/100
83s - loss: 0.3231 - acc: 0.8676 - val_loss: 0.7222 - val_acc: 0.6865
Epoch 5/100
78s - loss: 0.2438 - acc: 0.8991 - val_loss: 0.8492 - val_acc: 0.7024
Epoch 6/100
82s - loss: 0.1829 - acc: 0.9259 - val_loss: 0.9780 - val_acc: 0.6825
Epoch 7/100
79s - loss: 0.1422 - acc: 0.9522 - val_loss: 1.0055 - val_acc: 0.6997
Epoch 8/100
78s - loss: 0.1171 - acc: 0.9510 - val_loss: 1.1692 - val_acc: 0.6918
Epoch 9/100
77s - loss: 0.1086 - acc: 0.9609 - val_loss: 1.2282 - val_acc: 0.7249
Epoch 10/100
78s - loss: 0.0813 - acc: 0.9697 - val_loss: 1.4505 - val_acc: 0.6839
Epoch 11/100
76s - loss: 0.0688 - acc: 0.9761 - val_loss: 1.4280 - val_acc: 0.6561
Epoch 12/100
69s - loss: 0.0565 - acc: 0

In [16]:
# Bi-directional recurrent network (two GRUs, not LSTMs, read the sequence in
# opposite directions)

# based on keras tutorial: https://github.com/fchollet/keras/blob/master/examples/imdb_bidirectional_lstm.py

# Placeholder tensor for the padded integer sequences. It is named
# `sequence_input` so that it does not rebind `sequence`, the
# keras.preprocessing module imported at the top of the notebook, which the
# padding cell needs whenever it is re-run.
sequence_input = Input(shape=(max_len,), dtype='int32')
# Map every word id to a trainable vector of size embedding_neurons.
embedded = Embedding(max_features, embedding_neurons, input_length=max_len)(sequence_input)
# Batch-normalize the embedded sequence before the recurrent layers.
bnorm = BatchNormalization()(embedded)

# Forward GRU with lstm_neurons units; dropout_W / dropout_U apply dropout to
# the input and recurrent connections respectively.
forwards = GRU(lstm_neurons, dropout_W=0.4, dropout_U=0.4)(bnorm)
# Second GRU over the same input, processed in reverse order.
backwards = GRU(lstm_neurons, dropout_W=0.4, dropout_U=0.4, go_backwards=True)(bnorm)

# Concatenate the outputs of the two GRUs along the feature axis.
merged = merge([forwards, backwards], mode='concat', concat_axis=-1)
# Dropout on the merged representation, then one sigmoid unit for the binary label.
after_dp = Dropout(0.5)(merged)
output = Dense(1, activation='sigmoid')(after_dp)

model_bidir_atom = Model(input=sequence_input, output=output)
# Review the model structure. summary() prints the table itself and returns
# None, so wrapping it in print() would emit a stray "None" line.
model_bidir_atom.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_2 (InputLayer)             (None, 500)           0                                            
____________________________________________________________________________________________________
embedding_2 (Embedding)          (None, 500, 64)       2560000     input_2[0][0]                    
____________________________________________________________________________________________________
batchnormalization_2 (BatchNormal(None, 500, 64)       128         embedding_2[0][0]                
____________________________________________________________________________________________________
gru_2 (GRU)                      (None, 128)           74112       batchnormalization_2[0][0]       
___________________________________________________________________________________________

In [17]:
# Train the bi-directional GRU network and report the mean wall-clock time per epoch.

# Adam optimizer with binary cross-entropy loss; accuracy is tracked as an
# additional metric. Other optimizers / optimizer configs can be tried here.
model_bidir_atom.compile(optimizer='adam',
                         loss='binary_crossentropy',
                         metrics=['accuracy'])

print('Train...')
start_time = time.time()

# NOTE(review): the test split is used as validation data, and the recorded
# log shows val_loss rising from the second epoch on while the training loss
# keeps falling.
history_bidir_atom = model_bidir_atom.fit(
    X_train, y_train,
    batch_size=batch_size,
    nb_epoch=epochs,
    validation_data=[X_test, y_test],
    verbose=2)

end_time = time.time()
elapsed_seconds = end_time - start_time
average_time_per_epoch = elapsed_seconds / epochs
print("avg sec per epoch:", average_time_per_epoch)

Train...
Train on 1715 samples, validate on 756 samples
Epoch 1/100
158s - loss: 0.6503 - acc: 0.6752 - val_loss: 0.5085 - val_acc: 0.8095
Epoch 2/100
155s - loss: 0.5292 - acc: 0.7638 - val_loss: 0.5130 - val_acc: 0.8042
Epoch 3/100
152s - loss: 0.4763 - acc: 0.7953 - val_loss: 0.5300 - val_acc: 0.7989
Epoch 4/100
150s - loss: 0.4170 - acc: 0.8128 - val_loss: 0.5607 - val_acc: 0.7817
Epoch 5/100
157s - loss: 0.3708 - acc: 0.8402 - val_loss: 0.6080 - val_acc: 0.7659
Epoch 6/100
166s - loss: 0.3053 - acc: 0.8682 - val_loss: 0.6916 - val_acc: 0.7500
Epoch 7/100
161s - loss: 0.2700 - acc: 0.8915 - val_loss: 0.7667 - val_acc: 0.7381
Epoch 8/100
160s - loss: 0.2269 - acc: 0.9079 - val_loss: 0.8384 - val_acc: 0.7116
Epoch 9/100
166s - loss: 0.1886 - acc: 0.9201 - val_loss: 0.9028 - val_acc: 0.7037
Epoch 10/100
160s - loss: 0.1690 - acc: 0.9347 - val_loss: 0.9270 - val_acc: 0.6839
Epoch 11/100
157s - loss: 0.1411 - acc: 0.9446 - val_loss: 1.0356 - val_acc: 0.7050
Epoch 12/100
160s - loss: 0.1