# LSTM for sequence classification

In [64]:
import lstm_utils
import importlib
importlib.reload(lstm_utils)
from lstm_utils import *

## Loading the IMDB dataset

We're going to look at the IMDB dataset, which contains movie reviews from IMDB, along with their sentiment. Keras comes with some helpers for this dataset.

In [107]:
from keras.datasets import imdb
# fix random seed for reproducibility
np.random.seed(3)
# load the dataset but only keep the top n words, zero the rest
top_words = 5000
(X_train, y_trn), (X_test, y_test) = imdb.load_data(num_words=top_words)
# truncate and pad input sequences
max_review_length = 500

In [108]:
#X_train[0]

In [109]:
trn = sequence.pad_sequences(X_train, maxlen=max_review_length)
test = sequence.pad_sequences(X_test, maxlen=max_review_length)

In [41]:
idx = imdb.get_word_index()
idx_arr = sorted(idx, key=idx.get)
idx_arr[:10]

['the', 'and', 'a', 'of', 'to', 'is', 'br', 'in', 'it', 'i']

...and this is the mapping from id to word

In [48]:
idx2word = {v: k for k, v in idx.items()}

# Defining the model

In [89]:
# getting pretrained word embeddings 
#The pickled files from fast.ai are not working. I pickled them again and worked
#glove_path = get_glove_dataset()

In [87]:
glove_path = "/data/yinterian/Glove/6B.50d"

This function creates an array of work embeddings for our IMDB datset. It is using Glove embedings. You can find more about Glove embeddings here:
https://nlp.stanford.edu/projects/glove/

In [100]:
emb = create_emb(top_words, glove_path, idx2word)

In [102]:
emb.shape

(5000, 50)

In [106]:
vocab_size = top_words
seq_len = max_review_length

model = Sequential([
    Embedding(vocab_size, 50, input_length=seq_len, mask_zero=True,
              weights=[emb], trainable=False),
    LSTM(100, consume_less='gpu'),
    Dense(1, activation='sigmoid')])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])



In [110]:
model.fit(trn, y_train, validation_data=(test, y_test), epochs=2, batch_size=64)



Train on 25000 samples, validate on 25000 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f9ea81a8b38>

In [119]:
vocab_size = top_words
seq_len = max_review_length

model2 = Sequential([
    Embedding(vocab_size, 50, input_length=seq_len, mask_zero=True,
              weights=[emb], W_regularizer=l2(1e-6), dropout=0.2),
    LSTM(100, consume_less='gpu'),
    Dense(1, activation='sigmoid')])
model2.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])



In [120]:
model2.fit(trn, y_train, validation_data=(test, y_test), epochs=2, batch_size=64)

Train on 25000 samples, validate on 25000 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f9f3629ccf8>

In [121]:
model2.optimizer.lr.assign(0.0001)

<tf.Tensor 'Assign_88:0' shape=() dtype=float32_ref>

In [122]:
model2.fit(trn, y_train, validation_data=(test, y_test), epochs=2, batch_size=64)

Train on 25000 samples, validate on 25000 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f9f37cadef0>

try recurrent_dropout=0.2

In [135]:
vocab_size = top_words
seq_len = max_review_length

inputs = Input(shape=(seq_len,), dtype='int32')
x = Embedding(vocab_size, 50, input_length=seq_len, mask_zero=True,
              weights=[emb])(inputs)
x = Dropout(0.2)(x)
x = LSTM(100, implementation=2)(x)
x = Dense(1, activation='sigmoid')(x)
model = Model(inputs=inputs, outputs=x)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
model.fit(trn, y_train, validation_data=(test, y_test), epochs=2, batch_size=64)

Train on 25000 samples, validate on 25000 samples
Epoch 1/2
Epoch 2/2