In [1]:
import numpy as np # linear algebra
from sklearn.model_selection import train_test_split
import re
from time import time

from keras.callbacks import TensorBoard
from keras.callbacks import ModelCheckpoint
from keras.models import Sequential, model_from_json
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D, Flatten
from keras.utils.np_utils import to_categorical
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D

Using TensorFlow backend.


In [2]:
# Load the preprocessed data.
#
# wordIndex_all.npy stores the word -> integer-id dictionary as a pickled
# Python object, so it has to be loaded with allow_pickle=True: NumPy >= 1.16.3
# rejects pickled payloads by default and raises ValueError.
# NOTE(security): unpickling can execute arbitrary code -- only load files
# that you produced yourself or otherwise trust.
wordsList = np.load('wordIndex_all.npy', allow_pickle=True)
wordsList = wordsList.tolist()  # unwrap the loaded object array into the dict

ids = np.load('idsMatrix_all.npy')

# Inverse mapping (integer id -> word), used to turn id sequences back into text.
reverse_word_index = {value: key for key, value in wordsList.items()}

# Each row of `ids` is one document: the first 250 columns hold word ids and
# the remaining columns hold the category label.
assert ids.ndim == 2 and ids.shape[1] > 250, \
    "idsMatrix_all.npy must have 250 word-id columns followed by label columns"
X = ids[:, :250]   # word index
y = ids[:, 250:]   # category label

# Fixed random_state keeps the train/test split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [3]:
# Decode one stored document (row 300) from word ids back to text.
# Only the word-id columns (X) are decoded: a full row of `ids` also carries
# the label columns, which would otherwise be looked up as if they were word ids.
# Ids missing from the reverse index are rendered as '?'.
decoded_review = ' '.join(reverse_word_index.get(i, '?') for i in X[300])

print(decoded_review)

this a good episode of the new twilight zone that actually includes interesting ideas and clever stories i note both of them are based on short stories examination day is set in the future year unknown but at a point where they have cake candles that light themselves huge tv looking phones that double as numerous other entertaining machines and distributed only to those of a certain age and the examination day a point where 12 year olds must undergo a government required iq test the kid is this story dickie jordan david UNK is just celebrating his own 12th birthday and is a smart kid so is calm even eager to take the test that he has seen friends pass easily and knows he will excel at based on his school grades his parents christopher UNK and elizabeth UNK on the other hand say he shouldn't have used his birthday wish on getting a good score and while their reason includes that they believe he's capable and he should have no need to worry it's pretty obvious they are worried i won't gi

In [4]:
# Network hyper-parameters.

# --- vocabulary / embedding ---
MAX_NUM_WORDS = 40000       # only use top n words
numDimensions = 50          # dimensions of each word vector
maxSeqLength = X.shape[1]   # number of word ids per document (columns of X)

# --- architecture / training ---
numClasses = 2
lstmUnits = 64
batchSize = 24
iterations = 100000

# Create the Multi-Layer Perceptron models

In [5]:
# Multi-Layer Perceptron for binary classification:
# embedding -> flatten -> one hidden ReLU layer -> a single sigmoid unit.
model = Sequential()
model.add(Embedding(MAX_NUM_WORDS, numDimensions, input_length=maxSeqLength))
model.add(Flatten())
model.add(Dense(lstmUnits, activation='relu'))  # hidden width reuses lstmUnits
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the output.
model.summary()

# Fit the model. Only the first label column is used as the 0/1 target.
# NOTE: the held-out test split doubles as the validation set here.
model.fit(X_train, y_train[:, 0], validation_data=(X_test, y_test[:, 0]), epochs=2, batch_size=128, verbose=2)
# Final evaluation of the model
loss, acc = model.evaluate(X_test, y_test[:, 0], verbose=0)
print("Accuracy: %.2f%%" % (acc*100))

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 250, 50)           2000000   
_________________________________________________________________
flatten_1 (Flatten)          (None, 12500)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 64)                800064    
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 65        
Total params: 2,800,129
Trainable params: 2,800,129
Non-trainable params: 0
_________________________________________________________________
None
Train on 16750 samples, validate on 8250 samples
Epoch 1/2
 - 4s - loss: 0.5374 - acc: 0.7066 - val_loss: 0.3551 - val_acc: 0.8410
Epoch 2/2
 - 3s - loss: 0.1341 - acc: 0.9533 - val_loss: 0.3457 - val_acc: 0.8553
Accuracy: 85.53%


In [6]:
# Multi-Layer Perceptron for two-class classification:
# embedding -> flatten -> one hidden ReLU layer -> softmax over the classes.
model = Sequential()
model.add(Embedding(MAX_NUM_WORDS, numDimensions, input_length=maxSeqLength))
model.add(Flatten())
model.add(Dense(lstmUnits, activation='relu'))  # hidden width reuses lstmUnits
model.add(Dense(numClasses, activation='softmax'))  # one output per class
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the output.
model.summary()

# Fit the model against the full (multi-column) label matrix.
# NOTE: the held-out test split doubles as the validation set here.
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10, batch_size=128, verbose=2)
# Final evaluation of the model
loss, acc = model.evaluate(X_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (acc*100))

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 250, 50)           2000000   
_________________________________________________________________
flatten_2 (Flatten)          (None, 12500)             0         
_________________________________________________________________
dense_3 (Dense)              (None, 64)                800064    
_________________________________________________________________
dense_4 (Dense)              (None, 2)                 130       
Total params: 2,800,194
Trainable params: 2,800,194
Non-trainable params: 0
_________________________________________________________________
None
Train on 16750 samples, validate on 8250 samples
Epoch 1/10
 - 3s - loss: 0.5495 - acc: 0.6947 - val_loss: 0.3533 - val_acc: 0.8464
Epoch 2/10
 - 3s - loss: 0.1298 - acc: 0.9543 - val_loss: 0.3650 - val_acc: 0.8543
Epoch 3/10
 - 3s - loss: 0.0134 - acc: 0.998

# One-Dimensional Convolutional Neural Network Model for the IMDB Dataset

In [None]:
# 1D convolutional model for binary classification:
# embedding -> Conv1D (kernel 3, 'same' padding) -> max-pool (2) -> flatten
# -> dense ReLU (250) -> a single sigmoid unit.
model = Sequential()
model.add(Embedding(MAX_NUM_WORDS, numDimensions, input_length=maxSeqLength))
# The number of convolution filters is set equal to the embedding dimension.
model.add(Conv1D(filters=numDimensions, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(250, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the output.
model.summary()

In [None]:
# Train the current model on the first label column (0/1 target), validating
# on the test split, then report test accuracy.
binary_train_labels = y_train[:, 0]
binary_test_labels = y_test[:, 0]

model.fit(
    X_train,
    binary_train_labels,
    validation_data=(X_test, binary_test_labels),
    epochs=2,
    batch_size=128,
    verbose=2,
)

# Final evaluation of the model
loss, acc = model.evaluate(X_test, binary_test_labels, verbose=0)
print("Accuracy: %.2f%%" % (acc*100))

# Build the LSTM model

In [7]:
# LSTM classifier: embedding -> LSTM -> softmax over the classes.
model = Sequential()
model.add(Embedding(MAX_NUM_WORDS, numDimensions, input_length=maxSeqLength))
# Embedding weights: MAX_NUM_WORDS * numDimensions = 40000 * 50 = 2000000
model.add(LSTM(lstmUnits, recurrent_dropout=0.25))
# LSTM weights: (lstmUnits * (numDimensions + lstmUnits) + lstmUnits) * 4 (number of gates)
#             = (64 * (50 + 64) + 64) * 4 = 29440
model.add(Dense(numClasses, activation='softmax'))
# Classifier weights: lstmUnits * numClasses + numClasses = 64 * 2 + 2 = 130
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the output.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 250, 50)           2000000   
_________________________________________________________________
lstm_1 (LSTM)                (None, 64)                29440     
_________________________________________________________________
dense_5 (Dense)              (None, 2)                 130       
Total params: 2,029,570
Trainable params: 2,029,570
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
# Training callbacks.

# TensorBoard logging: the current timestamp is formatted into the log
# directory name, so each run writes to its own directory under logs/.
tensorboard = TensorBoard(log_dir="logs/{}".format(time()))

# Checkpointing: write the model to disk only when validation accuracy
# improves; the epoch number and val_acc are embedded in the file name.
filepath = "weights-improvement-{epoch:02d}-{val_acc:.2f}.hdf5"
checkpoint = ModelCheckpoint(
    filepath,
    monitor='val_acc',
    mode='max',
    save_best_only=True,
    verbose=1,
)

callbacks_list = [checkpoint, tensorboard]

In [None]:
# Train the current model for up to 100 epochs with the configured callbacks,
# validating on the test split, then report test accuracy.
model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=100,
    batch_size=128,
    callbacks=callbacks_list,
    verbose=2,
)

# Final evaluation of the model
loss, acc = model.evaluate(X_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (acc*100))

# Save/Load model and weights

In [2]:
from keras.models import load_model

# Restore a complete model from a saved checkpoint file.
# NOTE(review): the file name follows the checkpoint naming pattern
# (epoch 86, val_acc 0.83) -- presumably written by the ModelCheckpoint
# callback during training; confirm the file exists before running.
checkpoint_path = 'weights-improvement-86-0.83.hdf5'
model = load_model(checkpoint_path)

Using TensorFlow backend.


In [3]:
# Print the layer-by-layer summary of the model currently bound to `model`.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 250, 50)           2000000   
_________________________________________________________________
lstm_1 (LSTM)                (None, 64)                29440     
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 130       
Total params: 2,029,570
Trainable params: 2,029,570
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Pull the learned embedding weights out of the model.
# The embedding layer sits at position 0 in `model.layers`; the first entry
# of its weight list is taken as the embedding matrix.
embedding_layer = model.layers[0]
embeddings = embedding_layer.get_weights()[0]


In [None]:
# Evaluate the loaded model on the test split and report the second metric
# (index 1 of metrics_names) under its own name.
scores = model.evaluate(X_test, y_test, verbose=0)
loss, acc = scores
print("%s: %.2f%%" % (model.metrics_names[1], acc*100))

In [None]:
# Save the current model instance: architecture as JSON, weights as HDF5.
model_json = model.to_json()
with open("IMDB_Sentiment_model.json", "w") as json_file:
    json_file.write(model_json)
model.save_weights("IMDB_Sentiment_weights.h5")
print("Saved model to disk")

# Rebuild the model from the JSON description; the context manager closes
# the file even if reading fails.
with open('IMDB_Sentiment_model.json', 'r') as json_file:
    loaded_model_json = json_file.read()

loaded_model = model_from_json(loaded_model_json)
# Load the saved weights into the freshly built model.
loaded_model.load_weights("IMDB_Sentiment_weights.h5")
print("Loaded model from disk")

# A model restored from JSON + weights is not compiled, so compile it before
# evaluating. It is evaluated against the full multi-column label matrix
# (y_test), so categorical cross-entropy is used here.
loaded_model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
loss, acc = loaded_model.evaluate(X_test, y_test, verbose=0)
print("%s: %.2f%%" % (loaded_model.metrics_names[1], acc*100))

# Predict input doc with pre-trained model

In [None]:
from keras.preprocessing.text import text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences

# Maximum number of words per document; must match the input length the
# model was built with.
maxSeqLength = 250
unknownWordId = 1  # ID for unknown words "UNK"

#inputText = "That movie was terrible."
inputText = "That movie was the best one I have ever seen."

# Split the text into lower-cased words, stripping punctuation.
words = text_to_word_sequence(inputText, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', lower=True, split=' ')

# Convert the word sequence to a (1, maxSeqLength) matrix of word ids.
# Unused trailing positions stay 0. Input longer than maxSeqLength is
# truncated instead of raising IndexError when writing past the last column.
sentenceMatrix = np.zeros([1, maxSeqLength], dtype='int32')
for indexCounter, word in enumerate(words[:maxSeqLength]):
    # Words missing from the vocabulary fall back to the unknown-word id.
    sentenceMatrix[0, indexCounter] = wordsList.get(word, unknownWordId)

# Predict sentiment scores for the input text.
predictedSentiment = model.predict(sentenceMatrix)

print(predictedSentiment)