In [None]:
# * Outlook for multiple classes: for multi-class prediction use softmax but for multi-label prediction use sigmoid
# * Use aws deep learning instance: https://docs.aws.amazon.com/dlami/latest/devguide/keras-mxnet.html
# * Use t-SNE to visualize the word embeddings
# * Similarity search by using learned embeddings, i.e. 
#   https://blog.insightdatascience.com/the-unreasonable-effectiveness-of-deep-learning-representations-4ce83fc663cf
#   (especially Spotify Annoy Index)

In [None]:
from keras.datasets import imdb
from keras.preprocessing import sequence 
from keras.layers import LSTM, Embedding, Dense, Flatten, Bidirectional
from keras.models import Sequential
from keras.utils import get_file
from keras.initializers import Constant
import numpy as np

MAX_NUM_WORDS = 5000
MAX_SEQUENCE_LEN = 500

In [None]:
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=MAX_NUM_WORDS)

In [None]:
INDEX_FROM = 3 # since 0 = padding, 1 = start, 2 = unkown
word_to_id = imdb.get_word_index()
word_to_id = {k:(v+INDEX_FROM) for k,v in word_to_id.items()}
word_to_id["<PAD>"] = 0
word_to_id["<START>"] = 1
word_to_id["<UNK>"] = 2
id_to_word = {value:key for key,value in word_to_id.items()}
print(' '.join(id_to_word[id] for id in X_train[2] ))

In [None]:
X_train = sequence.pad_sequences(X_train, maxlen=MAX_SEQUENCE_LEN) 
X_test = sequence.pad_sequences(X_test, maxlen=MAX_SEQUENCE_LEN) 

In [26]:
# get glove coeff matrix
embeddings_index = {}
fname = get_file("glove.6B.100d.txt", "http://nlp.stanford.edu/data/glove.6B.zip", extract=True)
with open(fname, encoding="utf-8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('Found %s word vectors.' % len(embeddings_index))

# prepare pre-learned embedding matrix
embdedding_dim = 100
word_index = imdb.get_word_index()
num_words = min(MAX_NUM_WORDS, len(word_index)) + 1
embedding_matrix = np.zeros((num_words, embdedding_dim))
for word, i in word_index.items():
    if i > MAX_NUM_WORDS:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector
        
glove_embedding = Embedding(num_words, embdedding_dim, 
                            input_length=MAX_SEQUENCE_LEN,
                            embeddings_initializer=Constant(embedding_matrix), trainable=True)

# test embedding model
def cosine_sim(v1, v2):
    return np.dot(v1, v2) / np.sqrt(np.dot(v1, v1)) / np.sqrt(np.dot(v2, v2))
print(cosine_sim(embeddings_index["apple"], embeddings_index["metal"]))
print(cosine_sim(embeddings_index["apple"], embeddings_index["apple"]))
print(cosine_sim(embeddings_index["king"], embeddings_index["queen"]))
print(cosine_sim(embeddings_index["king"]-embeddings_index["man"]+embeddings_index["woman"], 
                 embeddings_index["queen"]))

Found 400000 word vectors.
0.20312092
1.0
0.7507691
0.78344136


In [27]:
USE_LSTM = False
USE_GLOVE = True # trainable=True

# Build the model 
model = Sequential()

if USE_GLOVE:
    model.add(glove_embedding)
else:
    embedding_vector_length = 32
    model.add(Embedding(MAX_NUM_WORDS, embedding_vector_length, input_length=MAX_SEQUENCE_LEN)) 

if USE_LSTM:
    model.add(Bidirectional(LSTM(32, return_sequences=True, dropout=0.1)))
    model.add(Bidirectional(LSTM(32, dropout=0.1)))
else:
    model.add(Flatten())
    model.add(Dense(250, activation='relu'))
    
model.add(Dense(1, activation='sigmoid')) 
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) 
print(model.summary()) 

model.fit(X_train, y_train, epochs=3, batch_size=32, validation_split=0.1)
model.evaluate(X_test, y_test)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, 500, 100)          500100    
_________________________________________________________________
flatten_3 (Flatten)          (None, 50000)             0         
_________________________________________________________________
dense_9 (Dense)              (None, 250)               12500250  
_________________________________________________________________
dense_10 (Dense)             (None, 1)                 251       
Total params: 13,000,601
Trainable params: 13,000,601
Non-trainable params: 0
_________________________________________________________________
None
Train on 22500 samples, validate on 2500 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


[0.4509590995502472, 0.8316]

In [25]:
USE_LSTM = False
USE_GLOVE = True # trainable=False

# Build the model 
model = Sequential()

if USE_GLOVE:
    model.add(glove_embedding)
else:
    embedding_vector_length = 32
    model.add(Embedding(MAX_NUM_WORDS, embedding_vector_length, input_length=MAX_SEQUENCE_LEN)) 

if USE_LSTM:
    model.add(Bidirectional(LSTM(32, return_sequences=True, dropout=0.1)))
    model.add(Bidirectional(LSTM(32, dropout=0.1)))
else:
    model.add(Flatten())
    model.add(Dense(250, activation='relu'))
    
model.add(Dense(1, activation='sigmoid')) 
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) 
print(model.summary()) 

model.fit(X_train, y_train, epochs=3, batch_size=32, validation_split=0.1)
model.evaluate(X_test, y_test)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 500, 100)          500100    
_________________________________________________________________
flatten_2 (Flatten)          (None, 50000)             0         
_________________________________________________________________
dense_7 (Dense)              (None, 250)               12500250  
_________________________________________________________________
dense_8 (Dense)              (None, 1)                 251       
Total params: 13,000,601
Trainable params: 12,500,501
Non-trainable params: 500,100
_________________________________________________________________
None
Train on 22500 samples, validate on 2500 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


[0.8002648965454101, 0.5774]

In [24]:
USE_LSTM = False
USE_GLOVE = False

# Build the model 
model = Sequential()

if USE_GLOVE:
    model.add(glove_embedding)
else:
    embedding_vector_length = 32
    model.add(Embedding(MAX_NUM_WORDS, embedding_vector_length, input_length=MAX_SEQUENCE_LEN)) 

if USE_LSTM:
    model.add(Bidirectional(LSTM(32, return_sequences=True, dropout=0.1)))
    model.add(Bidirectional(LSTM(32, dropout=0.1)))
else:
    model.add(Flatten())
    model.add(Dense(250, activation='relu'))
    
model.add(Dense(1, activation='sigmoid')) 
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) 
print(model.summary()) 

model.fit(X_train, y_train, epochs=3, batch_size=32, validation_split=0.1)
model.evaluate(X_test, y_test)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 500, 32)           160000    
_________________________________________________________________
flatten_1 (Flatten)          (None, 16000)             0         
_________________________________________________________________
dense_5 (Dense)              (None, 250)               4000250   
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 251       
Total params: 4,160,501
Trainable params: 4,160,501
Non-trainable params: 0
_________________________________________________________________
None
Train on 22500 samples, validate on 2500 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


[0.5609432465255261, 0.85392]

In [22]:
USE_LSTM = True
USE_GLOVE = False

# Build the model 
model = Sequential()

if USE_GLOVE:
    model.add(glove_embedding)
else:
    embedding_vector_length = 32
    model.add(Embedding(MAX_NUM_WORDS, embedding_vector_length, input_length=MAX_SEQUENCE_LEN)) 

if USE_LSTM:
    model.add(Bidirectional(LSTM(32, return_sequences=True, dropout=0.1)))
    model.add(Bidirectional(LSTM(32, dropout=0.1)))
else:
    model.add(Flatten())
    model.add(Dense(250, activation='relu'))
    
model.add(Dense(1, activation='sigmoid')) 
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) 
print(model.summary()) 

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 500, 32)           160000    
_________________________________________________________________
bidirectional_7 (Bidirection (None, 500, 64)           16640     
_________________________________________________________________
bidirectional_8 (Bidirection (None, 64)                24832     
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 65        
Total params: 201,537
Trainable params: 201,537
Non-trainable params: 0
_________________________________________________________________
None


In [23]:
model.fit(X_train, y_train, epochs=3, batch_size=32, validation_split=0.1)
model.evaluate(X_test, y_test)

Train on 22500 samples, validate on 2500 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


[0.3252795199871063, 0.86184]

In [18]:
#
# LSTM and GLOVE (fine-tune)
#

USE_LSTM = True
USE_GLOVE = True

# Build the model 
model = Sequential()

if USE_GLOVE:
    model.add(glove_embedding)
else:
    embedding_vector_length = 32
    model.add(Embedding(MAX_NUM_WORDS, embedding_vector_length, input_length=MAX_SEQUENCE_LEN)) 

if USE_LSTM:
    model.add(Bidirectional(LSTM(32, return_sequences=True, dropout=0.1)))
    model.add(Bidirectional(LSTM(32, dropout=0.1)))
else:
    model.add(Flatten())
    model.add(Dense(250, activation='relu'))
    
model.add(Dense(1, activation='sigmoid')) 
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) 
print(model.summary()) 

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 500, 100)          500100    
_________________________________________________________________
bidirectional_5 (Bidirection (None, 500, 64)           34048     
_________________________________________________________________
bidirectional_6 (Bidirection (None, 64)                24832     
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 65        
Total params: 559,045
Trainable params: 559,045
Non-trainable params: 0
_________________________________________________________________
None


In [19]:
model.fit(X_train, y_train, epochs=3, batch_size=32, validation_split=0.1)

Train on 22500 samples, validate on 2500 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x140652320>

In [20]:
model.evaluate(X_test, y_test)



[0.2923556010341644, 0.88252]

In [14]:
#
# LSTM and GLOVE (pre-trained, constant)
#

USE_LSTM = True
USE_GLOVE = True

# Build the model 
model = Sequential()

if USE_GLOVE:
    model.add(glove_embedding)
else:
    embedding_vector_length = 32
    model.add(Embedding(MAX_NUM_WORDS, embedding_vector_length, input_length=MAX_SEQUENCE_LEN)) 

if USE_LSTM:
    model.add(Bidirectional(LSTM(32, return_sequences=True, dropout=0.1)))
    model.add(Bidirectional(LSTM(32, dropout=0.1)))
else:
    model.add(Flatten())
    model.add(Dense(250, activation='relu'))
    
model.add(Dense(1, activation='sigmoid')) 
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) 
print(model.summary()) 

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 500, 100)          500100    
_________________________________________________________________
bidirectional_3 (Bidirection (None, 500, 64)           34048     
_________________________________________________________________
bidirectional_4 (Bidirection (None, 64)                24832     
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 65        
Total params: 559,045
Trainable params: 58,945
Non-trainable params: 500,100
_________________________________________________________________
None


In [15]:
model.fit(X_train, y_train, epochs=3, batch_size=32, validation_split=0.1)

Train on 22500 samples, validate on 2500 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x10cac34a8>

In [16]:
model.evaluate(X_test, y_test)



[0.5239130891036987, 0.73788]